################################################################################
##
## Purpose: This script estimates a range of topic models.
##
## Author: James Bisbee (james.h.bisbee@vanderbilt.edu)
##
##  - Inputs:
##    - ./data/prepped/finalData_for_NLP.RData: Prepped data from 6_DATA_intermediate_build.R
##  - Outputs:
##    - ./data/prepped/hearings/topic_robustness.RData
##    - ./data/prepped/hearings/topic_models_100.RData
##    - ./data/prepped/hearings/topic_robustness_JOPRR1_Grped.RData
##    - ./data/prepped/hearings/topic_robustness_JOPRR1_spkr.RData
##    - ./data/prepped/hearings/stm_results.RData
##    - ./data/prepped/hearings/topic_models_grp_spkr.RData
##    - ./data/prepped/finalData_70kGrped.RData
##      (NOTE: no save() call in this script writes this file -- verify)
##
##
## See associated log file for compute environment, package versions, 
##  and date of most recent run.
##
################################################################################
# Start from an empty workspace and a fixed RNG seed so the run is reproducible.
rm(list = ls())
gc()
set.seed(12345)

# Attach dependencies with library() rather than require(): require() only
# warns and returns FALSE when a package is missing, which lets the script
# carry on and fail much later with a far less informative error.
library(text2vec)
library(textstem)
library(stopwords)
library(tidyverse)
library(textmineR)
library(stm)
library(tm)

# Compute details: print a description of the machine and R session so the
# associated log file records the environment of this run.
print(paste0('Compute environment from ',Sys.Date(),' run by Bisbee'))
if(Sys.info()['sysname'] == 'Windows') {
  # wmic prints a header line first; [-1] / [2] skip it.
  ram_size <- system("wmic MemoryChip get Capacity", intern = TRUE)[-1]
  model_name <- system("wmic cpu get name", intern = TRUE)[2] # nocov
  vendor_id <- system("wmic cpu get manufacturer", intern = TRUE)[2] # nocov
  
  print(list(ram = stringr::str_squish(ram_size)[1],
             vendor_id = stringr::str_squish(vendor_id),
             model_name = stringr::str_squish(model_name),
             no_of_cores = parallel::detectCores()))
} else if(Sys.info()['sysname'] == 'Linux') {
  # Sys.info() reports 'Linux' on Linux systems; the previous comparison
  # against 'Linuxs' could never match, so this branch was unreachable.
  # Report CPU / memory use of running `rsession` processes as listed by ps.
  # NOTE(review): the fields are recovered by splitting on single spaces, so
  # the positional indices below depend on ps's column padding -- confirm.
  splitted <- strsplit(system("ps -C rsession -o %cpu,%mem,pid,cmd", intern = TRUE), " ")
  df <- do.call(rbind, lapply(splitted[-1], 
                              function(x) data.frame(
                                cpu = as.numeric(x[2]),
                                mem = as.numeric(x[4]),
                                pid = as.numeric(x[5]),
                                cmd = paste(x[-c(1:5)], collapse = " "))))
  # Print explicitly so the table also reaches the log when the script is
  # source()'d (auto-printing only happens for top-level evaluation).
  print(df)
} else {
  cat("If not on Linux or Windows, you'll have to figure out your own solution to seeing the compute environment.")
}

sessionInfo()


# Normalise raw text before lemmatisation.
#
# Args:
#   x: character vector of raw text (one element per document).
#
# Returns:
#   A character vector in which each element has been lower-cased, had
#   apostrophes removed (so "don't" becomes "dont"), had every non-alphabetic
#   character replaced by a space, and had runs of whitespace collapsed to a
#   single space. Leading/trailing spaces are not trimmed.
prep_fun <- function(x) {
  x <- str_to_lower(x)
  x <- str_replace_all(x, "'", "")
  x <- str_replace_all(x, "[^[:alpha:]]", " ")
  # Return the value explicitly: the function previously ended on an
  # assignment, so the result was only returned invisibly.
  str_replace_all(x, "\\s+", " ")
}

# Topic model estimated on speaker-hearing concatenated text
# (the .RData file is expected to provide `finalMerge`, which is used below)
load('./data/prepped/finalData_for_NLP.RData')

# Clean + lemmatize the text, keep rows with a known speaker, and retain only
# the document id and the cleaned text.
text <- finalMerge %>%
  mutate(textclean = lemmatize_strings(prep_fun(textclean))) %>%
  filter(!is.na(speaker)) %>%
  select(fullInd,textclean)

# The first 6000 rows are the training set and the remainder is held out.
# Fail fast when there are not enough rows: with nrow(text) <= 6000 the index
# 6001:nrow(text) counts downwards and would silently build a test set out of
# NA / training documents.
stopifnot(nrow(text) > 6000)

trainText <- text$textclean[1:6000]
testText <- text$textclean[6001:nrow(text)]

# Vocabulary, document-term matrix and term co-occurrence matrix are built
# from the training documents only.
it <- itoken(trainText,ids = text$fullInd[1:6000],progressbar = FALSE)
v <- create_vocabulary(it,stopwords = c(gsub("'",'',stopwords())))
v <- prune_vocabulary(v,doc_proportion_max = .2,term_count_min = 20)
vectorizer <- vocab_vectorizer(v)
dtm <- create_dtm(it,vectorizer)
tcm <- create_tcm(it,vectorizer,skip_grams_window = 10L)

# Held-out documents are vectorized with the training vocabulary.
it <- itoken(testText,ids = text$fullInd[6001:nrow(text)],progressbar = FALSE)
dtmTest <- create_dtm(it,vectorizer)

# Fit one LDA model per candidate number of topics and record, for each k,
# the per-topic coherence on the training co-occurrence matrix and the
# perplexity on the held-out documents.
cohres <- NULL
perplx <- NULL
for (k in c(10, 20, 30, 50, 100, 150, 200, 300, 400, 500, 750, 1000)) {
  # Same seed for every k.
  set.seed(123)
  lda_model <- LDA$new(
    n_topics = k,
    doc_topic_prior = 0.1,
    topic_word_prior = 0.01
  )
  doc_topic_distr <- lda_model$fit_transform(
    x = dtm,
    n_iter = 1000,
    convergence_tol = 0.001,
    n_check_convergence = 25,
    progressbar = FALSE
  )
  tw <- lda_model$get_top_words(n = 10, lambda = 1)

  # Coherence of the top-10 words per topic; newest k is stacked on top.
  cohTmp <- coherence(tw, tcm, n_doc_tcm = attr(v, 'document_count'))
  cohres <- data.frame(cohTmp) %>%
    mutate(topic = row.names(.), k = k) %>%
    bind_rows(cohres)

  # Perplexity of the held-out document-term matrix; newest k is appended.
  new_doc_topic_distr <- lda_model$transform(dtmTest)
  perplx <- bind_rows(
    perplx,
    data.frame(
      k = k,
      perplexity = perplexity(
        dtmTest,
        topic_word_distribution = lda_model$topic_word_distribution,
        doc_topic_distribution = new_doc_topic_distr
      )
    )
  )
  cat(k, '\n')
}


save(perplx, cohres, file = './data/prepped/hearings/topic_robustness.RData')

# Held-out perplexity by number of topics; the vertical line marks k = 100.
perplx %>%
  ggplot(aes(x = k,y = perplexity)) + 
  geom_point() + 
  geom_line() + 
  geom_vline(xintercept = 100)

# Looking at the results
# Average coherence (mean_logratio) across topics for each k, ignoring
# infinite scores. Only the plotted column is summarised: averaging every
# column also hit the character `topic` column, which produced warnings and
# an all-NA result for it.
cohres %>%
  filter(!is.infinite(mean_logratio),
         k > 0) %>%
  group_by(k) %>%
  # slice_max(mean_logratio,n = 100) %>%
  summarise(mean_logratio = mean(mean_logratio,na.rm = TRUE)) %>%
  ggplot(aes(x = k,y = mean_logratio)) + 
  geom_point() + 
  geom_line() + 
  geom_vline(xintercept = 100) + 
  ylab('Coherence (avg. log ratio)')


# Choosing the best
# Refit on every document in `text` with K = 100, using unigrams + bigrams
# and looser vocabulary pruning than in the grid search.
it <- itoken(text$textclean, ids = text$fullInd, progressbar = FALSE)
v <- create_vocabulary(
  it,
  ngram = c(1, 2),
  stopwords = c(gsub("'", '', stopwords()))
)
v <- prune_vocabulary(v, doc_proportion_max = .5, term_count_min = 10)
vectorizer <- vocab_vectorizer(v)
dtm <- create_dtm(it, vectorizer)
set.seed(123)
lda_model <- LDA$new(
  n_topics = 100,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.01
)
doc_topic_distr <- lda_model$fit_transform(
  x = dtm,
  n_iter = 1000,
  convergence_tol = 0.001,
  n_check_convergence = 25,
  progressbar = FALSE
)

# Reshape the document-topic matrix to long form: one row per (id, topic)
# with the topic share in `theta`. Any 'X' in the topic column names
# (presumably the prefix data.frame() adds) is stripped.
doc_topic_distr <- data.frame(doc_topic_distr) %>%
  mutate(id = text$fullInd) %>%
  as_tibble() %>%
  gather(topic, theta, -id) %>%
  mutate(topic = gsub('X', '', topic))


save(
  doc_topic_distr,
  lda_model,
  file = './data/prepped/hearings/topic_models_100.RData'
)





# STM version
# Rebuild the document table with the covariates the structural topic model
# needs. `interrupted` flags utterances whose raw text ends in "--"; it is
# computed before the text is cleaned.
text <- finalMerge %>%
  mutate(
    interrupted = ifelse(grepl('--$', textclean), 1, 0),
    textclean = lemmatize_strings(prep_fun(textclean))
  ) %>%
  filter(!is.na(speaker)) %>%
  select(
    textclean, chamber, opensecretsID, position, party,
    nKids, nDaughters, nSons, votepct_rel, gender, age, seniority,
    nominate_dim1, fullInd,
    interrupted
  )

# Quick check of how many utterances are flagged as interrupted.
count(text, interrupted)

# Where party is missing, fall back to the speaker's position.
text$party[is.na(text$party)] <- text$position[is.na(text$party)]

# Comparison group: the opensecretsID itself when it contains 'FED',
# otherwise chamber + party.
text <- mutate(
  text,
  comparisons = ifelse(
    grepl('FED', opensecretsID),
    opensecretsID,
    ifelse(
      chamber == 'Senate',
      paste0('Senate: ', party),
      paste0('House: ', party)
    )
  )
)

# Preprocess, fit a 100-topic STM with topic prevalence depending on the
# covariates, then estimate covariate effects on all 100 topics.
processed <- textProcessor(documents = text$textclean, metadata = text)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
fit <- stm(
  documents = out$documents,
  vocab = out$vocab,
  K = 100,
  prevalence = ~ comparisons + interrupted + s(nominate_dim1) + age + seniority + gender,
  max.em.its = 75,
  data = out$meta,
  init.type = 'LDA'
)

prep <- estimateEffect(
  1:100 ~ comparisons + interrupted + s(nominate_dim1) + age + seniority + gender,
  fit,
  metadata = out$meta,
  uncertainty = 'Global'
)


save(prep, fit, out, file = './data/prepped/hearings/stm_results.RData')


# RR1 work
# Fit LDA models over a grid of topic counts and score each of them.
#
# Args:
#   train: data frame of training documents.
#   test:  data frame of held-out documents.
#   text:  name (string) of the column holding the document text.
#   id:    name (string) of the column holding the document ids.
#   ks:    vector of topic counts to fit.
#
# Returns:
#   A list with two data frames:
#     perplexity - one row per k with the held-out perplexity (in `ks` order);
#     coherence  - coherence() output for the top-10 words of every topic,
#                  tagged with `topic` and `k` (most recently fitted k first).
#
# The RNG seed is set once at the top, so repeated calls start from the same
# RNG state.
topicModel_fun <- function(train,test,text,id,ks = c(10,20,30,50,70,100,150,200,300,400,500,750,1000)) {
  set.seed(1234)

  # Vocabulary, DTM and term co-occurrence matrix come from the training set.
  train_tokens <- itoken(train[[text]], ids = train[[id]], progressbar = FALSE)
  vocab <- create_vocabulary(
    train_tokens,
    stopwords = c(gsub("'", '', stopwords()))
  )
  vocab <- prune_vocabulary(vocab, doc_proportion_max = .2, term_count_min = 20)
  vec_fun <- vocab_vectorizer(vocab)
  train_dtm <- create_dtm(train_tokens, vec_fun)
  train_tcm <- create_tcm(train_tokens, vec_fun, skip_grams_window = 20L)

  # Held-out documents are vectorized with the training vocabulary.
  test_tokens <- itoken(test[[text]], ids = test[[id]], progressbar = FALSE)
  test_dtm <- create_dtm(test_tokens, vec_fun)

  perplexity_tbl <- NULL
  coherence_tbl <- NULL
  for (k in ks) {
    model <- LDA$new(
      n_topics = k,
      doc_topic_prior = 0.1,
      topic_word_prior = 0.01
    )
    train_theta <- model$fit_transform(
      x = train_dtm,
      n_iter = 1000,
      convergence_tol = 0.001,
      n_check_convergence = 25,
      progressbar = FALSE
    )
    top_words <- model$get_top_words(n = 10, lambda = 1)

    # Coherence: the newest k is stacked on top of the earlier results.
    topic_coherence <- coherence(
      top_words,
      train_tcm,
      n_doc_tcm = attr(vocab, 'document_count')
    )
    coherence_k <- data.frame(topic_coherence) %>%
      mutate(topic = row.names(.), k = k)
    coherence_tbl <- bind_rows(coherence_k, coherence_tbl)

    # Perplexity on the held-out documents: the newest k is appended.
    held_out_theta <- model$transform(test_dtm)
    ppl <- perplexity(
      test_dtm,
      topic_word_distribution = model$topic_word_distribution,
      doc_topic_distribution = held_out_theta
    )
    perplexity_tbl <- bind_rows(
      perplexity_tbl,
      data.frame(k = k, perplexity = ppl)
    )
    cat(k, '\n')
  }

  list(perplexity = perplexity_tbl, coherence = coherence_tbl)
}

# Topic model estimated on speaker-hearing concatenated text (JOP RR1)
# Reload the prepped data (used below as `finalMerge`).
load('./data/prepped/finalData_for_NLP.RData')

# Build `combText`: utterances pasted together within (opensecretsID, rwID)
# groups, where rwID is a run id derived from a 1000-character threshold.
# The steps are order-dependent; see the per-step notes.
text <- finalMerge %>%
  select(docID,fullInd,opensecretsID,textclean,nchars) %>%
  group_by(opensecretsID) %>%
  # Order utterances by speaker id, then by fullInd.
  arrange(opensecretsID,fullInd) %>%
  # Flag utterances longer than 1000 characters.
  mutate(over = nchars > 1000) %>%
  group_by(opensecretsID) %>%
  # Run id within speaker: increments every time the `over` flag flips.
  mutate(rwID = cumsum(over != lag(over, default = over[1]))) %>%
  group_by(opensecretsID,rwID) %>%
  # Running character total within each (speaker, run).
  mutate(cumchars = cumsum(nchars)) %>%
  ungroup() %>%
  # NOTE(review): group_id is not referenced again in this script.
  mutate(group_id = cumsum(cumchars > 1000) + 1) %>%
  # Evaluated on the ungrouped table: a row takes the previous row's rwID when
  # that previous row's running total is below 1000. lag() is NA for the very
  # first row, so its rwID becomes NA here (recoded to -1 below).
  # NOTE(review): because the data are ungrouped, lag() also looks across
  # speaker boundaries -- confirm this is intended.
  mutate(rwID = ifelse(lag(cumchars) < 1000,lag(rwID),rwID)) %>%
  group_by(opensecretsID,rwID) %>%
  # Concatenate the raw text of every utterance in the (speaker, run) group.
  mutate(combText = paste(textclean,collapse = '. ')) %>%
  # Replace a missing run id with -1.
  mutate(rwID = ifelse(is.na(rwID),-1,rwID)) %>%
  ungroup()

# One row per distinct (docID, opensecretsID, rwID, combText): clean and
# lemmatize the combined text, drop rows without a speaker id, and build the
# document id used by the topic models.
text2 <- text %>%
  ungroup() %>%
  select(docID, opensecretsID, rwID, combText) %>%
  distinct() %>%
  mutate(textclean = lemmatize_strings(prep_fun(combText))) %>%
  filter(!is.na(opensecretsID)) %>%
  mutate(
    nchars = nchar(textclean),
    uniqueID = paste0(docID, opensecretsID, rwID)
  ) %>%
  select(uniqueID, textclean)


# Random 50/50 split into training and held-out documents.
inds <- sample(
  1:nrow(text2),
  size = round(nrow(text2) * .5),
  replace = FALSE
)
train <- slice(text2, inds)
test <- slice(text2, -inds)

# Perplexity / coherence over the default grid of topic counts.
tmRes_Grped <- topicModel_fun(train, test, text = 'textclean', id = 'uniqueID')


save(
  tmRes_Grped,
  file = './data/prepped/hearings/topic_robustness_JOPRR1_Grped.RData'
)


# Choosing the best
# Fit a 70-topic LDA on all grouped documents in `text2`, using unigrams +
# bigrams. No seed is set here, so the fit uses whatever RNG state the
# preceding code left behind.
it <- itoken(text2$textclean, ids = text2$uniqueID, progressbar = FALSE)
v <- create_vocabulary(
  it,
  ngram = c(1, 2),
  stopwords = c(gsub("'", '', stopwords()))
)
v <- prune_vocabulary(v, doc_proportion_max = .5, term_count_min = 10)
vectorizer <- vocab_vectorizer(v)
dtm <- create_dtm(it, vectorizer)
lda_model <- LDA$new(
  n_topics = 70,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.01
)
doc_topic_distr <- lda_model$fit_transform(
  x = dtm,
  n_iter = 1000,
  convergence_tol = 0.001,
  n_check_convergence = 25,
  progressbar = FALSE
)



# Long form: one row per (id, topic) with the topic share in `theta`; any 'X'
# in the topic column names is stripped.
doc_topic_distr <- data.frame(doc_topic_distr) %>%
  mutate(id = text2$uniqueID) %>%
  as_tibble() %>%
  gather(topic, theta, -id) %>%
  mutate(topic = gsub('X', '', topic))

# Diagnostic: number of utterance rows per grouped-document id.
text %>%
  mutate(uniqueID = paste0(docID,opensecretsID,rwID)) %>%
  count(uniqueID)

# Diagnostic: number of rows per id in the wide topic table (columns are named
# topic70Grped_<topic>).
doc_topic_distr %>%
  spread(topic,theta,sep = '70Grped_') %>%
  rename(uniqueID = id) %>%
  count(uniqueID)

# Attach the grouped-document topic shares to every utterance row of `text`.
# No `by` is given, so the join uses all column names the two tables share.
# Rows of `text` whose id has no fitted document (e.g. those with a missing
# opensecretsID, which were filtered out before fitting) get NA topic shares.
textToMerge <- text %>%
  mutate(uniqueID = paste0(docID,opensecretsID,rwID)) %>%
  left_join(doc_topic_distr %>%
              spread(topic,theta,sep = '70Grped_') %>%
              rename(uniqueID = id))

# Bring the grouped topic columns back onto the full utterance-level data;
# again joined on the shared column names.
# NOTE(review): this relies on (docID, opensecretsID, fullInd) identifying a
# unique row in textToMerge -- otherwise the join duplicates rows. Confirm.
utterance_level <- finalMerge %>%
  left_join(textToMerge %>%
              select(docID,opensecretsID,fullInd,matches('topic70Grped')))

# Calculate speaker topics instead
# One document per (docID, opensecretsID): all of that speaker's utterances
# in the hearing pasted together, then cleaned and lemmatized.
text <- utterance_level %>%
  select(docID, fullInd, opensecretsID, textclean, nchars) %>%
  group_by(docID, opensecretsID) %>%
  summarise(textclean = paste(textclean, collapse = '. ')) %>%
  ungroup() %>%
  mutate(
    textclean = lemmatize_strings(prep_fun(textclean)),
    uniqueID = paste0(docID, opensecretsID)
  )

# Random 80/20 split into training and held-out documents.
inds <- sample(
  1:nrow(text),
  size = round(nrow(text) * .8),
  replace = FALSE
)
train <- slice(text, inds)
test <- slice(text, -inds)

# Perplexity / coherence over the default grid of topic counts.
tmRes_spkr <- topicModel_fun(train, test, text = 'textclean', id = 'uniqueID')

save(
  tmRes_spkr,
  file = './data/prepped/hearings/topic_robustness_JOPRR1_spkr.RData'
)


# Choosing the best
# Fit a 70-topic LDA on all speaker-hearing documents in `text`, using
# unigrams + bigrams. No seed is set here, so the fit uses whatever RNG state
# the preceding code left behind.
it <- itoken(text$textclean, ids = text$uniqueID, progressbar = FALSE)
v <- create_vocabulary(
  it,
  ngram = c(1, 2),
  stopwords = c(gsub("'", '', stopwords()))
)
v <- prune_vocabulary(v, doc_proportion_max = .5, term_count_min = 10)
vectorizer <- vocab_vectorizer(v)
dtm <- create_dtm(it, vectorizer)
lda_model <- LDA$new(
  n_topics = 70,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.01
)
doc_topic_distr <- lda_model$fit_transform(
  x = dtm,
  n_iter = 1000,
  convergence_tol = 0.001,
  n_check_convergence = 25,
  progressbar = FALSE
)


# Long form: one row per (id, topic) with the topic share in `theta`; any 'X'
# in the topic column names is stripped.
doc_topic_distr <- data.frame(doc_topic_distr) %>%
  mutate(id = text$uniqueID) %>%
  as_tibble() %>%
  gather(topic, theta, -id) %>%
  mutate(topic = gsub('X', '', topic))

# Attach the speaker-level topic shares (columns named topic70Spkr_<topic>)
# to the speaker-hearing documents. No `by` is given, so the join uses the
# column names the two tables share.
textToMerge <- text %>%
  left_join(doc_topic_distr %>%
              spread(topic,theta,sep = '70Spkr_') %>%
              rename(uniqueID = id))

# Add the speaker-level topic columns to the utterance-level data, which
# already carries the grouped (topic70Grped_*) columns; joined on the shared
# column names.
utterance_level <- utterance_level %>%
  left_join(textToMerge %>%
              select(docID,opensecretsID,matches('topic70Spkr')))


# Keep the identifiers plus every column whose name matches 'topic'.
# NOTE(review): `ind` is not created anywhere in this script (the rest of the
# file keys utterances on `fullInd`), so it must already exist in finalMerge
# -- confirm. matches('topic') would also pick up any pre-existing column in
# finalMerge whose name contains 'topic'.
topics_toMerge <- utterance_level %>%
  select(docID,opensecretsID,ind,matches('topic'))

save(topics_toMerge,file = './data/prepped/hearings/topic_models_grp_spkr.RData')


# EOF
